{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright (C) 2016 - 2019 Pinard Liu(liujianping-ok@163.com)\n",
    "\n",
    "https://www.cnblogs.com/pinard\n",
    "\n",
    "Permission given to modify the code as long as you keep this declaration at the top\n",
    "\n",
    "用gensim学习word2vec https://www.cnblogs.com/pinard/p/7278324.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "\n",
    "import jieba\n",
    "import jieba.analyse\n",
    "\n",
    "jieba.suggest_freq('沙瑞金', True)\n",
    "jieba.suggest_freq('田国富', True)\n",
    "jieba.suggest_freq('高育良', True)\n",
    "jieba.suggest_freq('侯亮平', True)\n",
    "jieba.suggest_freq('钟小艾', True)\n",
    "jieba.suggest_freq('陈岩石', True)\n",
    "jieba.suggest_freq('欧阳菁', True)\n",
    "jieba.suggest_freq('易学习', True)\n",
    "jieba.suggest_freq('王大路', True)\n",
    "jieba.suggest_freq('蔡成功', True)\n",
    "jieba.suggest_freq('孙连城', True)\n",
    "jieba.suggest_freq('季昌明', True)\n",
    "jieba.suggest_freq('丁义珍', True)\n",
    "jieba.suggest_freq('郑西坡', True)\n",
    "jieba.suggest_freq('赵东来', True)\n",
    "jieba.suggest_freq('高小琴', True)\n",
    "jieba.suggest_freq('赵瑞龙', True)\n",
    "jieba.suggest_freq('林华华', True)\n",
    "jieba.suggest_freq('陆亦可', True)\n",
    "jieba.suggest_freq('刘新建', True)\n",
    "jieba.suggest_freq('刘庆祝', True)\n",
    "\n",
    "with open('./in_the_name_of_people.txt') as f:\n",
    "    document = f.read()\n",
    "    \n",
    "    #document_decode = document.decode('GBK')\n",
    "    \n",
    "    document_cut = jieba.cut(document)\n",
    "    #print  ' '.join(jieba_cut)  //如果打印结果，则分词效果消失，后面的result无法显示\n",
    "    result = ' '.join(document_cut)\n",
    "    result = result.encode('utf-8')\n",
    "    with open('./in_the_name_of_people_segment.txt', 'w') as f2:\n",
    "        f2.write(result)\n",
    "f.close()\n",
    "f2.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import modules & set up logging\n",
    "import logging\n",
    "import os\n",
    "from gensim.models import word2vec\n",
    "\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
    "\n",
    "sentences = word2vec.LineSentence('./in_the_name_of_people_segment.txt') \n",
    "\n",
    "model = word2vec.Word2Vec(sentences, hs=1,min_count=1,window=3,size=100)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "祁同伟 0.977946519852\n",
      "赵东来 0.969956457615\n",
      "侯亮平 0.96396112442\n",
      "沙瑞金 0.959288835526\n",
      "易学习 0.95690870285\n"
     ]
    }
   ],
   "source": [
    "req_count = 5\n",
    "for key in model.wv.similar_by_word('李达康'.decode('utf-8'), topn =100):\n",
    "    if len(key[0])==3:\n",
    "        req_count -= 1\n",
    "        print key[0], key[1]\n",
    "        if req_count == 0:\n",
    "            break;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "李达康 0.969956457615\n",
      "陆亦可 0.969528734684\n",
      "赵瑞龙 0.966745913029\n",
      "祁同伟 0.965694904327\n",
      "蔡成功 0.961919903755\n"
     ]
    }
   ],
   "source": [
    "req_count = 5\n",
    "for key in model.wv.similar_by_word('赵东来'.decode('utf-8'), topn =100):\n",
    "    if len(key[0])==3:\n",
    "        req_count -= 1\n",
    "        print key[0], key[1]\n",
    "        if req_count == 0:\n",
    "            break;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "沙瑞金 0.961137533188\n",
      "开玩笑 0.935248732567\n",
      "祁同伟 0.931046843529\n",
      "李达康 0.921290516853\n",
      "打电话 0.916399776936\n"
     ]
    }
   ],
   "source": [
    "req_count = 5\n",
    "for key in model.wv.similar_by_word('高育良'.decode('utf-8'), topn =100):\n",
    "    if len(key[0])==3:\n",
    "        req_count -= 1\n",
    "        print key[0], key[1]\n",
    "        if req_count == 0:\n",
    "            break;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "高育良 0.967257142067\n",
      "李达康 0.959131598473\n",
      "田国富 0.953414440155\n",
      "易学习 0.943500876427\n",
      "祁同伟 0.942932963371\n"
     ]
    }
   ],
   "source": [
    "req_count = 5\n",
    "for key in model.wv.similar_by_word('沙瑞金'.decode('utf-8'), topn =100):\n",
    "    if len(key[0])==3:\n",
    "        req_count -= 1\n",
    "        print key[0], key[1]\n",
    "        if req_count == 0:\n",
    "            break;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.961137455325\n",
      "0.935589365706\n"
     ]
    }
   ],
   "source": [
    "print model.wv.similarity('沙瑞金'.decode('utf-8'), '高育良'.decode('utf-8'))\n",
    "print model.wv.similarity('李达康'.decode('utf-8'), '王大路'.decode('utf-8'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "刘庆祝\n"
     ]
    }
   ],
   "source": [
    "print model.wv.doesnt_match(u\"沙瑞金 高育良 李达康 刘庆祝\".split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    " "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
